import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import os

# Switch the working directory to the folder holding the CSV so the relative
# file name below resolves there.
os.chdir("C:/Users/agusv/Desktop/Estudio/Tesis/Csv")

# Load the dataset and index it by the parsed 'Date' column.
data = pd.read_csv("multivariate_lng.csv", parse_dates=['Date']).set_index('Date')
lng_prices = data['LNG Price']

# Calendar-year masks for the two subsets.
# NOTE(review): the training mask (>= 2023) also selects every 2024 row, so
# train_data is a superset of test_data -- confirm this overlap is intended.
observation_year = data.index.year
train_data = data[observation_year >= 2023]
test_data = data[observation_year == 2024]

# Disabled stationarity check, kept as an inert string literal (never executed).
# The text below would decompose the LNG price series, run an ADF test on it,
# and add month dummies only when the ADF p-value is >= 0.05.
# NOTE(review): the ADF test assesses stationarity (unit root), not
# seasonality, so the `is_seasonal` name conflates the two; the decomposition
# result is also never used. Confirm the intent before re-enabling.
"""
decomposition = seasonal_decompose(lng_prices, model='additive', period=12)
adf_test = adfuller(lng_prices.dropna())
print(f"ADF Test Statistic: {adf_test[0]}, P-value: {adf_test[1]}")
is_seasonal = adf_test[1] >= 0.05

if is_seasonal:
    print("La serie de precios de LNG no es estacionaria y puede tener estacionalidad.")
    data['Month'] = data.index.month
    data_with_dummies = pd.get_dummies(data, columns=['Month'], drop_first=True)
else:
    print("La serie de precios de LNG es estacionaria. No se añadirá componente estacional.")
    data_with_dummies = data.copy()
"""

# Month-of-year dummies (January is the dropped baseline) act as the seasonal
# regressors shared by every model below.
data['Month'] = data.index.month
# dtype=float: pandas >= 2.0 emits boolean dummies by default. Combined with
# the float regressors they turn the design matrix into an object-dtype array,
# which statsmodels' SARIMAX rejects ("Pandas data cast to numpy dtype of
# object"). Requesting float dummies keeps the matrix numeric on every pandas
# version without changing any value.
seasonal_dummies = pd.get_dummies(data, columns=['Month'], drop_first=True, dtype=float)

# Regressor columns shared by the training and testing matrices.
feature_columns = ['Brent Spot Price', 'Inflation', 'USD/EUR'] + [f'Month_{i}' for i in range(2, 13)]

# NOTE(review): the training rows (year >= 2023) include the 2024 rows that
# are also used for testing, so every score computed on X_testing / y_testing
# is an in-sample score. Restrict the training mask (e.g. to years before
# 2024) if an out-of-sample evaluation is wanted -- left unchanged here
# because the intended training window is a modelling decision.
dummy_year = seasonal_dummies.index.year
training_rows = seasonal_dummies[dummy_year >= 2023]
testing_rows = seasonal_dummies[dummy_year == 2024]

X_training = training_rows[feature_columns]
y_training = training_rows['LNG Price']
X_testing = testing_rows[feature_columns]
y_testing = testing_rows['LNG Price']

# ARIMAX benchmark: regression on the exogenous columns with SARIMA errors.
# The original note reads "auto arima 101010", presumably the orders
# (1, 0, 1) x (0, 1, 0, 12) used below.
# NOTE(review): the month dummies repeat every 12 periods, so the seasonal
# difference (D=1, s=12) may leave their coefficients unidentified -- verify.
arimax_model = SARIMAX(
    y_training,
    exog=X_training,
    order=(1, 0, 1),
    seasonal_order=(0, 1, 0, 12),
)
arimax_results = arimax_model.fit(disp=False)

# Predict over the span of the testing index.
# NOTE(review): if the 2024 rows are part of y_training (the training mask is
# year >= 2023), this range is in-sample and the exog argument is presumably
# not needed by statsmodels -- confirm.
forecast_start = X_testing.index[0]
forecast_end = X_testing.index[-1]
y_p_arimax = arimax_results.predict(start=forecast_start, end=forecast_end, exog=X_testing)

# Error metrics against the observed 2024 prices.
arimax_error = y_testing - y_p_arimax
rmse_arimax = np.sqrt(mean_squared_error(y_testing, y_p_arimax))
mape_arimax = np.mean(np.abs(arimax_error / y_testing)) * 100

# Ordinary least squares on the same regressors as the ARIMAX model.
linear_model = LinearRegression()
linear_model.fit(X_training, y_training)
y_pred_linear = linear_model.predict(X_testing)

# Residuals on the testing rows; the two hybrid models below are fitted on them.
residuals_linear = y_testing - y_pred_linear

rmse_linear = np.sqrt(mean_squared_error(y_testing, y_pred_linear))
mape_linear = np.mean(np.abs(residuals_linear / y_testing)) * 100

# Hybrid 1: gradient-boosted trees fitted on the linear model's residuals.
# NOTE(review): the residuals come from the 2024 testing rows, and the
# corrected prediction is scored on those same rows, so 80% of the evaluated
# points were seen during fitting. X_val_split / y_val_split are created but
# never used afterwards in this script.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_testing,
    residuals_linear,
    test_size=0.2,
    random_state=42,
)

xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgboost_model_linear = XGBRegressor(**xgb_params)
xgboost_model_linear.fit(X_train_split, y_train_split)

# Linear prediction plus the learned residual correction.
residual_correction = xgboost_model_linear.predict(X_testing)
y_pred_linear_xgb = y_pred_linear + residual_correction

xgb_error = y_testing - y_pred_linear_xgb
rmse_linear_xgb = np.sqrt(mean_squared_error(y_testing, y_pred_linear_xgb))
mape_linear_xgb = np.mean(np.abs(xgb_error / y_testing)) * 100

# auto_arima is only referenced inside the disabled (string-literal) snippets
# of this script; no active statement in view calls it.
from pmdarima import auto_arima

# Disabled: non-seasonal automatic order search on the linear-model residuals,
# kept as an inert string literal (never executed).
"""
sarima_residuals_model = auto_arima(residuals_linear,seasonal=False,trace=True)
print(sarima_residuals_model.summary())
"""

# Hybrid 2: ARMA(2, 0, 1) model of the linear residuals, with no exogenous terms.
# NOTE(review): the model is fitted on the testing-row residuals and predicted
# over that same date range, so the correction is an in-sample fit.
sarimax_model_residuals = SARIMAX(residuals_linear, order=(2, 0, 1))
sarimax_residuals_results = sarimax_model_residuals.fit(disp=False)

residual_fit = sarimax_residuals_results.predict(
    start=X_testing.index[0],
    end=X_testing.index[-1],
)
y_pred_linear_sarimax = y_pred_linear + residual_fit

sarimax_error = y_testing - y_pred_linear_sarimax
rmse_linear_sarimax = np.sqrt(mean_squared_error(y_testing, y_pred_linear_sarimax))
mape_linear_sarimax = np.mean(np.abs(sarimax_error / y_testing)) * 100

# Overlay the observed prices (2024 onwards) with each model's predictions.
plt.figure(figsize=(14, 6))

actual_prices = lng_prices[lng_prices.index.year >= 2024]
plt.plot(actual_prices, label="Real Price LNG [€/kg]", color='green')

# (predicted values, legend label, line colour), drawn in this order.
prediction_lines = [
    (y_pred_linear, "Lineal", 'orange'),
    (y_p_arimax, "ARIMAX", 'red'),
    (y_pred_linear_xgb, "Lineal + XGBoost", 'purple'),
    (y_pred_linear_sarimax, "Lineal + SARIMAX", 'pink'),
]
for predicted, line_label, line_color in prediction_lines:
    plt.plot(test_data.index, predicted, label=line_label, color=line_color, linestyle='--')

plt.xlabel("Date")
plt.ylabel("Price LNG [€/kg]")
plt.title("Prediction comparison")
plt.legend()
plt.show()

# Summary table: one row per model with its MAPE and RMSE on the testing rows.
model_scores = [
    ('ARIMAX', mape_arimax, rmse_arimax),
    ('Linear', mape_linear, rmse_linear),
    ('Linear + XGBoost', mape_linear_xgb, rmse_linear_xgb),
    ('Linear + SARIMAX', mape_linear_sarimax, rmse_linear_sarimax),
]
score_names, score_mapes, score_rmses = zip(*model_scores)
error_analysing = {
    'Model': list(score_names),
    'MAPE [%]': list(score_mapes),
    'RMSE': list(score_rmses),
}
errores = pd.DataFrame(error_analysing)
print(errores)

# Disabled: plot of the historical LNG price from 2020 onwards, kept as an
# inert string literal (never executed).
"""
plt.figure(figsize=(14, 6))
plt.plot(lng_prices[lng_prices.index.year >= 2020], label="Real Price LNG [€/kg]", color='green')
plt.xlabel("Date")
plt.ylabel("Price LNG [€/kg]")
plt.title("LNG Price Historical Data")
plt.legend()
plt.show()
"""

# Disabled: (1) bar chart of each model's MAE relative to the mean test price,
# and (2) a 12-month-ahead forecast that combines a linear regression with a
# SARIMAX model of its residuals. Kept as an inert string literal.
# NOTE(review): the text references mae_arimax, mae_linear, mae_linear_xgb and
# mae_linear_sarimax, none of which is assigned by the active code above, so it
# would raise NameError if re-enabled as-is. It also reads a 'Variation'
# column, whereas the active code uses 'Inflation' -- confirm the CSV schema.
"""
error_percentages = {
    'ARIMAX': mae_arimax / y_testing.mean() * 100,
    'Lineal': mae_linear / y_testing.mean() * 100,
    'Lineal + XGBoost': mae_linear_xgb / y_testing.mean() * 100,
    'Lineal + SARIMAX': mae_linear_sarimax / y_testing.mean() * 100
}

plt.figure(figsize=(10, 6))
plt.bar(error_percentages.keys(), error_percentages.values(), color=['red', 'orange', 'green', 'purple', 'brown', 'pink', 'cyan'])
plt.ylabel("Mean Absolute Error [%]")
plt.title("Mean Absolute Error of Models")
plt.show()

#When we get the method that has the least MAE, in this case Lineal + SARIMAX
#We get to provide some future values to the exogenous variables

dataf = pd.read_csv("multivariate_lng.csv", parse_dates=['Date'])
dataf.set_index('Date', inplace=True)
dataf = dataf[dataf.index >= '2023-05-01']

brent_series = dataf['Brent Spot Price']
ipc_series = dataf['Variation']

auto_arima(brent_series).summary()
auto_arima(ipc_series).summary()

sarimax_results_brent = SARIMAX(brent_series, order=(0, 1, 0)).fit(disp=False)
sarimax_results_ipc = SARIMAX(ipc_series, order=(2, 0, 1)).fit(disp=False)

future_steps = 12
future_index = pd.date_range(start=dataf.index[-1] + pd.DateOffset(months=1), periods=future_steps, freq='MS')
brent_forecast = sarimax_results_brent.predict(start=future_index[0], end=future_index[-1])
ipc_forecast = sarimax_results_ipc.predict(start=future_index[0], end=future_index[-1])

future_var = pd.DataFrame({
    'Brent Spot Price': brent_forecast,
    'Variation': ipc_forecast
}, index=future_index)

X_training = dataf[['Brent Spot Price', 'Variation']]
y_training = dataf['LNG Price']
linear_model = LinearRegression().fit(X_training, y_training)
linear_predictions = linear_model.predict(future_var)

residuals = y_training - linear_model.predict(X_training)
sarimax_residuals_results = SARIMAX(residuals, order=(2, 0, 1)).fit(disp=False)

residuals_forecast = sarimax_residuals_results.predict(start=future_index[0], end=future_index[-1])
final_predictions = linear_predictions + residuals_forecast

confidence_interval = 0.05
upper_bound = final_predictions * (1 + confidence_interval)
lower_bound = final_predictions * (1 - confidence_interval)

predictions_df = pd.DataFrame({
    'Date': future_index,
    'Predicted LNG Price': final_predictions,
    'Upper Bound': upper_bound,
    'Lower Bound': lower_bound
})

plt.figure(figsize=(14, 6))
plt.plot(dataf['LNG Price'], label="Real LNG Price", color="blue")
plt.plot(future_index, final_predictions, label="Linear + SARIMAX Forecast", color="red", linestyle="--")
plt.scatter(future_index, final_predictions, color='blue', zorder=5)
plt.fill_between(future_index, lower_bound, upper_bound, color='orange', alpha=0.3)
plt.xlabel("Date")
plt.ylabel("LNG Price [€/kg]")
plt.title("LNG Price Prediction (Linear + SARIMAX)")
plt.legend()
plt.show()
"""